/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.util.*; import java.util.logging.*; import net.nutch.io.*; import net.nutch.util.*; /********************************************************* * The EditSectionGroupReader will read in an edits-file that * was built in a distributed way. It acts as a "meta-SequenceFile", * incorporating knowledge of Section numbering as well as * process-synchronization. If you had different ideas * about how to make the db-edits distributed (apart from using * NFS), you'd implement them here. * * @author Mike Cafarella *********************************************************/ public class EditSectionGroupReader { static final Logger LOG = LogFormatter.getLogger("net.nutch.db.EditSectionGroupReader"); private final static String MERGED_EDITS = "merged_edits"; private final static int SLEEP_INTERVAL = 3000; private final static int WORRY_INTERVALS = 5; NutchFileSystem nutchfs; String dbName, label; int readerNum = -1, totalMachines = -1, numEdits = 0; boolean sectionComplete = false; /** * Open the EditSectionGroupReader for the appropriate file. */ public EditSectionGroupReader(NutchFileSystem nutchfs, String dbName, String label, int readerNum, int totalMachines) { this.nutchfs = nutchfs; this.dbName = dbName; this.label = label; this.readerNum = readerNum; this.totalMachines = totalMachines; } /** * Block until all contributions to the EditSection are present * and complete. To figure out how many contributors there are, * we load the meta-info first (which is written at section-create * time). */ private synchronized void sectionComplete() throws IOException { if (! sectionComplete) { // // Make sure that every contributor's file is present. // When all are present, we know this section is complete. // for (int i = 0; i < totalMachines; i++) { // Create the files we're interested in NutchFile allEditsDir = new NutchFile(nutchfs, dbName, "editsection." + readerNum, new File("editsdir." + i)); NutchFile editsDir = new NutchFile(allEditsDir, label); NutchFile editsList = new NutchFile(editsDir, "editslist"); NutchFile editsInfo = new NutchFile(editsDir, "editsinfo"); // Block until the editsInfo file appears File editsInfoFile = nutchfs.get(editsInfo); // Read in edit-list info DataInputStream in = new DataInputStream(new FileInputStream(editsInfoFile)); try { in.read(); // version this.numEdits += in.readInt(); // numEdits } finally { in.close(); } } sectionComplete = true; } } /** * Return how many edits there are in this section. This * method requires total section-completion before executing. */ public int numEdits() throws IOException { sectionComplete(); return numEdits; } /** * Merge all the components of the Section into a single file * and return the location. This method requires total section- * completion before executing. */ public File mergeSectionComponents() throws IOException { // Wait till all edit-contributors are done. sectionComplete(); // The merged destination file for this section File mergedEditsFile = nutchfs.getWorkingFile(); // // Figure out the keyclass // NutchFile allEdits0 = new NutchFile(nutchfs, dbName, "editsection." + readerNum, new File("editsdir." + 0)); NutchFile editsDir0 = new NutchFile(allEdits0, label); NutchFile editsList0 = new NutchFile(editsDir0, "editslist"); File editsListFile0 = nutchfs.get(editsList0); SequenceFile.Reader test = new SequenceFile.Reader(editsListFile0.getPath()); Class keyClass = null; try { keyClass = test.getKeyClass(); } finally { test.close(); } // // Now write out contents of each contributor's file // try { Writable key = (Writable) keyClass.newInstance(); SequenceFile.Writer out = new SequenceFile.Writer(mergedEditsFile.getPath(), keyClass, NullWritable.class); try { for (int i = 0; i < totalMachines; i++) { NutchFile allEditsDir = new NutchFile(nutchfs, dbName, "editsection." + readerNum, new File("editsdir." + i)); NutchFile editsDir = new NutchFile(allEditsDir, label); NutchFile editsList = new NutchFile(editsDir, "editslist"); File editsListFile = nutchfs.get(editsList); SequenceFile.Reader in = new SequenceFile.Reader(editsListFile.getPath()); try { while (in.next(key)) { out.append(key, NullWritable.get()); } } finally { in.close(); } } } finally { out.close(); } } catch (InstantiationException ie) { throw new IOException("Could not create instance of " + keyClass); } catch (IllegalAccessException iae) { throw new IOException("Could not create instance of " + keyClass); } return mergedEditsFile; } /** * Get rid of the edits encapsulated by this file. */ public void delete() throws IOException { for (int i = 0; i < totalMachines; i++) { // Delete the files we're interested in NutchFile editsDir = new NutchFile(nutchfs, dbName, "editsection." + readerNum, new File("editsdir." + i)); NutchFile consumedEdits = new NutchFile(editsDir, label); nutchfs.delete(consumedEdits); } } }